clear all 
set more off 

*---------------------------------------------------------------------*
* MULTIPLE IMPUTATIONS --- 1910-1930 (1940 CENSUS DATA COMES FROM NBER SERVER)
*---------------------------------------------------------------------*

* Bring in data
	use "$Mydirectory1/3_Output/TSIV_2Samples_1940Census.dta", clear 
	
* Income from income scores 
	local log_dad_score "log_father_HHinc_1936fix"
	
* Income from Census microdata 
	local log_dad "log_father_income_1936"

* Select sample 
	keep if survey1936==1 | (census==1 & problem_occs!=1) | surveys==1
	replace census=1 if survey1936==1
	tab fatheroccej census
	
* Variables for saving estimates
	forval i=1(1)2 {
		gen est_`i'=. 
		gen est_lb_`i' =.
		gen est_ub_`i' =.
		
		gen cov_`i'=.
		gen var_x_`i'=.
		gen var_y_`i'=.
	}
	
* Keep triplets in both surveys 
	egen triplet= group(fatheroccej race south_merge)
	
	bysort triplet census: gen tag = _n==1
	bysort triplet: egen number_surveys = sum(tag)
	keep if number_surveys==2
	
* Make dummy variables 
	drop triplet
	egen triplet= group(fatheroccej race south_merge)
	quietly tab triplet, gen(trip_)
	
	tempfile restricted 
	save `restricted'
	
* Loop over each cohort; run baseline regression 
	forval i=1910(10)1930 {
		
		noisily display "Baseline approach, `i'"
			
		preserve
			
		keep if surveys==1 & decade==`i' 

		//summary stats 
		sum `log_dad_score' [aw=wgt_sex_race], d
		local var_y = `r(Var)'
		sum log_son_baseline [aw=wgt_sex_race], d
		local var_x = `r(Var)'
		
		corr log_son_baseline `log_dad_score' [aw=wgt_sex_race], c
		local cov_baseline = `r(cov_12)'
		
		//baseline regression 
		reg log_son_baseline `log_dad_score' [aw=wgt_sex_race], robust  
		
	* Save 
		restore 
		
		replace est_1 = _b[`log_dad_score'] if decade==`i'
		replace est_ub_1 = _b[`log_dad_score']+1.96*_se[`log_dad_score'] if decade==`i'
		replace est_lb_1 = _b[`log_dad_score']-1.96*_se[`log_dad_score'] if decade==`i'
		
		replace cov_1 = `cov_baseline' if decade==`i'
		replace var_x_1 = `var_x' if decade==`i'
		replace var_y_1 = `var_y' if decade==`i'
		
	}

	* Create "results" dataset 
	bysort decade: keep if _n==1
	keep decade est_* cov_* var_x_* var_y_*
	
	tempfile results1
	save `results1'
	
* Now do multiple imputations on full dataset 

	noisily display "Impute values"

	use `restricted', clear
	
	sum log_son_baseline [aw=wgt_sex_race], d
	local var_x = `r(Var)'

	mi set wide
	mi register imputed `log_dad'
	
	local reps "100"
	mi impute regress `log_dad' trip_*, add(`reps') rseed(819016)
	
	forval i=1910(10)1930 {
		
		noisily display "Estimates using imputations, `i'"
			
		preserve
			
		keep if surveys==1 & decade==`i' 

    * Grab the variances from each imputation 		
		gen variance=.
		forval j=1(1)`reps' {
			quietly sum _`j'_`log_dad' [aw=wgt_sex_race], d 
			quietly replace variance = `r(Var)' if _n==`j'
		} 
		egen avg_variance = mean(variance) //avg variance across imputations 
		sum avg_variance 
		local var_yp = `r(mean)'
		
		mi estimate, post: reg log_son_baseline `log_dad' [pw=wgt_sex_race], robust 	
		
		mat m= r(table)
		mat list m
		scalar coeff1=m[1,1]
		scalar lb = m[5,1]
		scalar ub = m[6,1]
		
		display "Display covariance"
		scalar cov_imp= coeff1*`var_yp' //Back out covariance
		display cov_imp
		
	* Save these estimates in "results" dataset 
		use `results1', clear
		
		replace est_2 = coeff1 if decade==`i'
		replace est_ub_2 = ub if decade==`i'
		replace est_lb_2 = lb if decade==`i'

		replace cov_2 = cov_imp if decade==`i'
		replace var_x_2 = `var_x' if decade==`i'
		replace var_y_2 = `var_yp' if decade==`i'
		
		sum cov_2 
		local imp = `r(mean)'
		display "COMPARE `imp' with `cov_baseline'"
		
		tempfile results1
		save `results1'
		
		restore
		
	}
	
* Bring in results and save 
	use `results1', clear
	
	reshape long est_ est_lb_ est_ub_ cov_ var_x_ var_y_, i(decade) j(estimate)
	replace decade= decade+2 if estimate==2
	drop if decade==. | est_==.	
	
	save "$Mydirectory2/appendix_b/imputed_values_results_1940_1936fix.dta", replace 
